date: 2018-10-29

R setup

knitr::opts_chunk$set(echo = TRUE, fig.width = 9, fig.height = 9)

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.3.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(knitr)
library(readxl)
library(moments)
library(epiR)
## Warning: package 'epiR' was built under R version 3.4.4
## Loading required package: survival
## Package epiR 0.9-97 is loaded
## Type help(epi.about) for summary information
## 
library(e1071)
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:moments':
## 
##     kurtosis, moment, skewness
library(lsmeans)
## Warning: package 'lsmeans' was built under R version 3.4.4
## The 'lsmeans' package is being deprecated.
## Users are encouraged to switch to 'emmeans'.
## See help('transition') for more information, including how
## to convert 'lsmeans' objects and scripts to work with 'emmeans'.
library(lmSupport)
## Warning: package 'lmSupport' was built under R version 3.4.4
library(ppcor)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(tableone)
## Warning: package 'tableone' was built under R version 3.4.4
library(sjPlot)
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.2.12
## Current Matrix version is 1.2.13
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(leaps)

Load data

rm(list = ls())

## define directory
analysisDir <- '/Users/matthewmoll/Documents/Fellowship/MPH/Fall2018/BST213_regression/homework/'
  
## Read in data
cost <- read_excel("/Users/matthewmoll/Documents/Fellowship/MPH/Fall2018/BST213_regression/homework/cost.xls")

Data structure

head(cost)
## # A tibble: 6 x 14
##   outcosts2 ptage   ssi panic anxiety married female educat  iadl social
##       <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>  <dbl>  <dbl> <dbl>  <dbl>
## 1       83.  38.9  1.00    0.      0.      1.     1.     5. 100.    100.
## 2     2162.  53.3  1.00    0.      0.      0.     1.     2.  91.7   100.
## 3     2203.  42.7  1.08    0.      0.      0.     1.     1.  75.0   100.
## 4        0.  70.2  1.15    0.      0.      0.     1.     1.  46.7   100.
## 5     4050.  41.1  1.00    0.      0.      1.     1.     3. 100.    100.
## 6      815.  38.4  1.15    0.      0.      0.     1.     2.  83.3   100.
## # ... with 4 more variables: racecat <dbl>, whiteleycat <dbl>,
## #   charlson <dbl>, depcat <dbl>
str(cost)
## Classes 'tbl_df', 'tbl' and 'data.frame':    461 obs. of  14 variables:
##  $ outcosts2  : num  83 2162 2203 0 4050 ...
##  $ ptage      : num  38.9 53.3 42.7 70.2 41.1 ...
##  $ ssi        : num  1 1 1.08 1.15 1 ...
##  $ panic      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anxiety    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ married    : num  1 0 0 0 1 0 1 0 0 0 ...
##  $ female     : num  1 1 1 1 1 1 0 1 0 0 ...
##  $ educat     : num  5 2 1 1 3 2 5 3 2 2 ...
##  $ iadl       : num  100 91.7 75 46.7 100 ...
##  $ social     : num  100 100 100 100 100 ...
##  $ racecat    : num  4 2 2 1 1 2 4 2 1 1 ...
##  $ whiteleycat: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ charlson   : num  0 1 1 2 0 3 0 2 0 1 ...
##  $ depcat     : num  3 3 3 3 3 3 3 3 3 3 ...
sapply(cost, summary)
## $outcosts2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0     307    1016    1836    2463   11909 
## 
## $ptage
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.96   32.24   44.62   45.16   55.21   89.68       1 
## 
## $ssi
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.231   1.538   1.738   2.077   5.000       6 
## 
## $panic
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06291 0.00000 1.00000 
## 
## $anxiety
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06508 0.00000 1.00000 
## 
## $married
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.4096  1.0000  1.0000       2 
## 
## $female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.744   1.000   1.000 
## 
## $educat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.000   3.000   3.359   5.000   5.000       1 
## 
## $iadl
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   72.22   94.44   81.82  100.00  100.00 
## 
## $social
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00  100.00  100.00   89.86  100.00  100.00       2 
## 
## $racecat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.297   4.000   4.000 
## 
## $whiteleycat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.456   2.000   3.000 
## 
## $charlson
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.5662  1.0000  7.0000 
## 
## $depcat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.00    3.00    2.72    3.00    3.00

Define variables

The goal for this assignment is to build a linear regression model using the priniciples discussed in class, and then write it into a methods and results section of a manuscript.


removevars <- c()

## Define response variables
responseVars <- c('outcosts2')
responseVars
## [1] "outcosts2"
## Define explanatory variables
explanVars <- names(cost)[!names(cost) %in% c("outcosts2")]
explanVars
##  [1] "ptage"       "ssi"         "panic"       "anxiety"     "married"    
##  [6] "female"      "educat"      "iadl"        "social"      "racecat"    
## [11] "whiteleycat" "charlson"    "depcat"
## List of quantitative variables
quantVars <- names(cost[!names(cost) %in% removevars &
                                  sapply(cost, function(x)
                                      length(levels(as.factor(x)))>7)])
quantVars
## [1] "outcosts2" "ptage"     "ssi"       "iadl"      "social"    "charlson"
## List of binary variables
binVars <- names(cost[!names(cost) %in% removevars &
                                sapply(cost, function(x)
                                    length(levels(as.factor(x)))==2)])
binVars
## [1] "panic"   "anxiety" "married" "female"
## List of categorical variables

catVars <- names(cost[!names(cost) %in% c(removevars,binVars) &
                                sapply(cost, function(x)
                                    length(levels(as.factor(x)))<=7 & length(levels(as.factor(x)))>1)])
catVars
## [1] "educat"      "racecat"     "whiteleycat" "depcat"
## Get Intersection between quantVars and explanVars and responseVars
explanVars.quant <- intersect(explanVars,quantVars)
responseVars.quant <- intersect(responseVars,quantVars)

## Get intersection between categorical and binary variables with explanatory and response vars
explanVars.cat <- intersect(explanVars,catVars)
explanVars.cat <- explanVars.cat[!explanVars.cat %in% c(binVars)]

responseVars.cat <- intersect(responseVars, catVars)
responseVars.cat <- responseVars.cat[!responseVars.cat %in% c(binVars)]


## Get binary explanatory and response variables
explanVars.bin <- intersect(explanVars, binVars)
responseVars.bin <- intersect(responseVars, binVars)

## View data ranges
str(cost)
## Classes 'tbl_df', 'tbl' and 'data.frame':    461 obs. of  14 variables:
##  $ outcosts2  : num  83 2162 2203 0 4050 ...
##  $ ptage      : num  38.9 53.3 42.7 70.2 41.1 ...
##  $ ssi        : num  1 1 1.08 1.15 1 ...
##  $ panic      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anxiety    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ married    : num  1 0 0 0 1 0 1 0 0 0 ...
##  $ female     : num  1 1 1 1 1 1 0 1 0 0 ...
##  $ educat     : num  5 2 1 1 3 2 5 3 2 2 ...
##  $ iadl       : num  100 91.7 75 46.7 100 ...
##  $ social     : num  100 100 100 100 100 ...
##  $ racecat    : num  4 2 2 1 1 2 4 2 1 1 ...
##  $ whiteleycat: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ charlson   : num  0 1 1 2 0 3 0 2 0 1 ...
##  $ depcat     : num  3 3 3 3 3 3 3 3 3 3 ...
sapply(cost, summary)
## $outcosts2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0     307    1016    1836    2463   11909 
## 
## $ptage
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.96   32.24   44.62   45.16   55.21   89.68       1 
## 
## $ssi
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.231   1.538   1.738   2.077   5.000       6 
## 
## $panic
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06291 0.00000 1.00000 
## 
## $anxiety
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06508 0.00000 1.00000 
## 
## $married
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.4096  1.0000  1.0000       2 
## 
## $female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.744   1.000   1.000 
## 
## $educat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.000   3.000   3.359   5.000   5.000       1 
## 
## $iadl
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   72.22   94.44   81.82  100.00  100.00 
## 
## $social
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00  100.00  100.00   89.86  100.00  100.00       2 
## 
## $racecat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.297   4.000   4.000 
## 
## $whiteleycat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.456   2.000   3.000 
## 
## $charlson
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.5662  1.0000  7.0000 
## 
## $depcat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.00    3.00    2.72    3.00    3.00
## turn binary and category variables into factors
varsToFactor <- c(catVars, binVars) 

cost[,varsToFactor] <- sapply(cost[,varsToFactor],as.factor)


Normality assessment

cost_quant <- as.data.frame(cost %>% dplyr::select(quantVars))


normalityfun <- function(dataset) {
  require(moments)
  require(e1071)
  
  print(paste0("Summary Statistics: ", names(dataset)[i]))
  print(paste0("Mean: ", mean(as.numeric(dataset[,i]), na.rm = T)))
  print(paste0("Standard Deviation: ",sd(as.numeric(dataset[,i]), na.rm = T)))
  print(paste0("Median: ", median(as.numeric(dataset[,i]), na.rm = T)))
  print(paste0("Skewness: ", skewness(as.numeric(dataset[,i]), na.rm = T)))
  print(paste0("Kurtosis: ", e1071::kurtosis(as.numeric(dataset[,i]), type = 2)))
  print(paste0("Shapiro-Wilk Test: ", shapiro.test(as.numeric(dataset[,i]))$p.value))
  print(" ")
  print(" ")
  
  qqnorm(as.numeric(dataset[,i]), main = paste0("Normal Q-Q plot: ",names(dataset)[i]))
  qqline(as.numeric(dataset[,i]), col = 2)
  
  
}



for(i in 1:length(quantVars)) {
  
    print(ggplot() + 
            geom_histogram(data = cost_quant, fill = "blue", alpha = 0.5, mapping = 
                             aes(x = get(quantVars[i]))) +
                xlab(quantVars[i]) + labs(title = "Histogram")
    
  )
  
    normalityfun(cost_quant)
  
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: outcosts2"
## [1] "Mean: 1836.48373101952"
## [1] "Standard Deviation: 2195.4502208015"
## [1] "Median: 1016"
## [1] "Skewness: 1.86911992818483"
## [1] "Kurtosis: 3.71673113110437"
## [1] "Shapiro-Wilk Test: 1.32694421506979e-24"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: ptage"
## [1] "Mean: 45.1636818141237"
## [1] "Standard Deviation: 15.4858207820523"
## [1] "Median: 44.621492128679"
## [1] "Skewness: 0.422861613908007"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 9.58087935521775e-08"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: ssi"
## [1] "Mean: 1.73845513460898"
## [1] "Standard Deviation: 0.693705411185324"
## [1] "Median: 1.53846153846154"
## [1] "Skewness: 1.35710574430674"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 1.83131200837982e-19"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: iadl"
## [1] "Mean: 81.8185104844541"
## [1] "Standard Deviation: 25.5994105151121"
## [1] "Median: 94.4444444444444"
## [1] "Skewness: -1.54182530496445"
## [1] "Kurtosis: 1.54732926242677"
## [1] "Shapiro-Wilk Test: 3.82337675082766e-26"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: social"
## [1] "Mean: 89.8571774388768"
## [1] "Standard Deviation: 22.5952089828678"
## [1] "Median: 100"
## [1] "Skewness: -2.50691098205916"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 1.85402296574893e-33"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: charlson"
## [1] "Mean: 0.566160520607375"
## [1] "Standard Deviation: 1.07464059120437"
## [1] "Median: 0"
## [1] "Skewness: 2.57120849300461"
## [1] "Kurtosis: 8.03575657287451"
## [1] "Shapiro-Wilk Test: 1.66716672242903e-31"
## [1] " "
## [1] " "


The only variable that looks close to normal is patient age, which we may be able to use parametric testing by invoking the central limit theorem. Otherwise, we will report median and interquartile ranges.


Missingness

## assess for and remove > 10% missingness
## Set threshold
thr <- 0.1*length(cost$outcosts2)

sapply(cost, function(x) ifelse(sum(is.na(x)) > thr, 1, 0))
##   outcosts2       ptage         ssi       panic     anxiety     married 
##           0           0           0           0           0           0 
##      female      educat        iadl      social     racecat whiteleycat 
##           0           0           0           0           0           0 
##    charlson      depcat 
##           0           0
sapply(cost,function(x) sum(is.na(x)))
##   outcosts2       ptage         ssi       panic     anxiety     married 
##           0           1           6           0           0           2 
##      female      educat        iadl      social     racecat whiteleycat 
##           0           1           0           2           0           0 
##    charlson      depcat 
##           0           0
## Create complete dataset of phenotypic variables
cost <- na.omit(cost)

No variables were removed for missingness. However, ssi has the greatest missingness.

Table 1: Demographics

## Add labels to table

labelled::var_label(cost$outcosts2) <- "Outpatient costs ($)"
labelled::var_label(cost$ptage) <- "Age in years"
labelled::var_label(cost$ssi) <- "Somatic Symptom Index (No. (%))"
labelled::var_label(cost$panic) <- "Panic (No. (%))"
labelled::var_label(cost$anxiety) <- "Anxiety (No. (%))"
labelled::var_label(cost$married) <- "Married (No. (%))"
labelled::var_label(cost$female) <- "Female (No. (%))"
labelled::var_label(cost$educat) <- "Education Level (No. (%))"
labelled::var_label(cost$iadl) <- "IADL score"
labelled::var_label(cost$social) <- "Social Activities Score"
labelled::var_label(cost$racecat) <- "Race (No. (%))"
labelled::var_label(cost$whiteleycat) <- "Whiteley Category"
labelled::var_label(cost$charlson) <- "Charlson index"
labelled::var_label(cost$depcat) <- "Depression Category (No. (%))"




## Group variables

demovars <- names(cost)
demovars
##  [1] "outcosts2"   "ptage"       "ssi"         "panic"       "anxiety"    
##  [6] "married"     "female"      "educat"      "iadl"        "social"     
## [11] "racecat"     "whiteleycat" "charlson"    "depcat"
varsToFactor <- c(catVars, binVars) 

nonNormalVars <- names(cost_quant)[!names(cost_quant) %in% c("ptage")]

## Table one

tableOne <- CreateTableOne(vars = demovars, data = cost, factorVars = varsToFactor, testNonNormal = nonNormalVars)
tableOne
##                        
##                         Overall          
##   n                         449          
##   outcosts2 (mean (sd)) 1829.86 (2205.83)
##   ptage (mean (sd))       45.22 (15.55)  
##   ssi (mean (sd))          1.74 (0.69)   
##   panic = 1 (%)              28 ( 6.2)   
##   anxiety = 1 (%)            28 ( 6.2)   
##   married = 1 (%)           184 (41.0)   
##   female = 1 (%)            332 (73.9)   
##   educat (%)                             
##      1                       39 ( 8.7)   
##      2                      100 (22.3)   
##      3                       91 (20.3)   
##      4                       94 (20.9)   
##      5                      125 (27.8)   
##   iadl (mean (sd))        81.80 (25.72)  
##   social (mean (sd))      89.73 (22.78)  
##   racecat (%)                            
##      1                      174 (38.8)   
##      2                       97 (21.6)   
##      3                       48 (10.7)   
##      4                      130 (29.0)   
##   whiteleycat (%)                        
##      1                      297 (66.1)   
##      2                      100 (22.3)   
##      3                       52 (11.6)   
##   charlson (mean (sd))     0.57 (1.07)   
##   depcat (%)                             
##      1                       45 (10.0)   
##      2                       36 ( 8.0)   
##      3                      368 (82.0)
## Save as csv and word document
tableOne_out <- print(tableOne,quote = FALSE, noSpaces = TRUE, printToggle = FALSE, varLabel = TRUE, nonnormal = nonNormalVars)


write.csv(tableOne_out, file = paste0(analysisDir,"TableOne.csv"))
tableOne_out <- fread(paste0(analysisDir, "TableOne.csv"))

colnames(tableOne_out) <- c("","")

tab_df(tableOne_out, file = paste0(analysisDir, "TableOne.doc"))
n 449
Outpatient costs ($) (median [IQR]) 998.00 [305.00, 2463.00]
Age in years (mean (sd)) 45.22 (15.55)
Somatic Symptom Index (No. (%)) (median [IQR]) 1.54 [1.23, 2.08]
Panic (No. (%)) = 1 (%) 28 (6.2)
Anxiety (No. (%)) = 1 (%) 28 (6.2)
Married (No. (%)) = 1 (%) 184 (41.0)
Female (No. (%)) = 1 (%) 332 (73.9)
Education Level (No. (%)) (%)
1 39 (8.7)
2 100 (22.3)
3 91 (20.3)
4 94 (20.9)
5 125 (27.8)
IADL score (median [IQR]) 94.44 [72.22, 100.00]
Social Activities Score (median [IQR]) 100.00 [100.00, 100.00]
Race (No. (%)) (%)
1 174 (38.8)
2 97 (21.6)
3 48 (10.7)
4 130 (29.0)
Whiteley Category (%)
1 297 (66.1)
2 100 (22.3)
3 52 (11.6)
Charlson index (median [IQR]) 0.00 [0.00, 1.00]
Depression Category (No. (%)) (%)
1 45 (10.0)
2 36 (8.0)
3 368 (82.0)

Scatterplots and correlation coefficients

## Scatterplots and correlation coefficients

for (i in 1:length(explanVars.quant)) {
  
  for (j in 1:length(responseVars.quant)) {
 
     print(paste0("Explanatory var: ", explanVars.quant[i]))
     print(paste0("Response var: ", responseVars.quant[j]))
     
     cormod <- cor.test(as.data.frame(cost)[,explanVars.quant[i]],
                                    as.data.frame(cost)[,responseVars.quant[j]], 
                        method = "spearman")
     
     print(paste0("Spearman Correlation Coefficient: ", signif(cormod$estimate,5)))
     print(paste0("p-value: ", signif(cormod$p.value,5)))
     
     print(" ")
     print(" ")
  
     print(ggplot(data = cost) + 
             geom_point(mapping = aes(x = get(explanVars.quant[i]), 
                                      y = get(responseVars.quant[j]))) +
             geom_smooth(mapping = aes(x = get(explanVars.quant[i]), 
                                       y = get(responseVars.quant[j])), method = 'lm') +
             xlab(explanVars.quant[i]) + ylab(responseVars.quant[j]) +
               labs(subtitle =
                      paste0(paste0("r-value: ", signif(cormod$estimate,5)),", ",
                             paste0("p-value: ", signif(cormod$p.value,5))))) 
     
     ## plot residuals
     print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost)) + 
             geom_point(aes(x = .fitted, y = .resid)) + 
             labs(title = paste0("Residuals vs fitted: ",explanVars.quant[i]))) # fitted vs residuals
     
    print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost)) + 
            geom_histogram(aes(x = .resid)) + 
            labs(title = paste0("Residuals histogram: ",explanVars.quant[i])))       
     

    }
}
## [1] "Explanatory var: ptage"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties
## [1] "Spearman Correlation Coefficient: 0.12543"
## [1] "p-value: 0.0077943"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: ssi"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.28145"
## [1] "p-value: 1.2772e-09"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: iadl"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.28202"
## [1] "p-value: 1.1791e-09"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: social"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.16497"
## [1] "p-value: 0.00044822"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: charlson"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.32663"
## [1] "p-value: 1.2696e-12"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


All vars: Scatterplots and correlation coefficients

## Scatterplots and correlation coefficients

for (i in 1:length(explanVars.quant)) {
  
  for (j in 1:length(responseVars.quant)) {
 
     print(paste0("Explanatory var: ", explanVars.quant[i]))
     print(paste0("Response var: ", responseVars.quant[j]))
     
     cormod <- cor.test(as.data.frame(cost)[,explanVars.quant[i]],
                                    as.data.frame(cost)[,responseVars.quant[j]], 
                        method = "spearman")
     
     print(paste0("Spearman Correlation Coefficient: ", signif(cormod$estimate,5)))
     print(paste0("p-value: ", signif(cormod$p.value,5)))
     
     print(" ")
     print(" ")
  
     print(ggplot(data = cost) + 
             geom_point(mapping = aes(x = get(explanVars.quant[i]), 
                                      y = get(responseVars.quant[j]))) +
             geom_smooth(mapping = aes(x = get(explanVars.quant[i]), 
                                       y = get(responseVars.quant[j])), method = 'lm') +
             xlab(explanVars.quant[i]) + ylab(responseVars.quant[j]) +
               labs(subtitle =
                      paste0(paste0("r-value: ", signif(cormod$estimate,5)),", ",
                             paste0("p-value: ", signif(cormod$p.value,5))))) 
     
     ## plot residuals
     print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost)) + 
             geom_point(aes(x = .fitted, y = .resid)) + 
             labs(title = paste0("Residuals vs fitted: ",explanVars.quant[i]))) # fitted vs residuals
     
    print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost)) + 
            geom_histogram(aes(x = .resid)) + 
            labs(title = paste0("Residuals histogram: ",explanVars.quant[i])))       
     

    }
}
## [1] "Explanatory var: ptage"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties
## [1] "Spearman Correlation Coefficient: 0.12543"
## [1] "p-value: 0.0077943"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: ssi"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.28145"
## [1] "p-value: 1.2772e-09"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: iadl"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.28202"
## [1] "p-value: 1.1791e-09"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: social"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.16497"
## [1] "p-value: 0.00044822"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: charlson"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.32663"
## [1] "p-value: 1.2696e-12"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Variable selection

# 
# FS <- regsubsets(as.formula(LinEq), data = cost, method = "forward", force.in = c("depcat","charlson"))
# 
# str(summary(FS))
# str(FS)
# 
# summary(FS)$rss

set.seed(123)

null <- lm(outcosts2~1, data = cost)
full <- lm(outcosts2~., data = cost)

FS <- step(null, scope=list(lower=null, upper=full), direction="forward")
## Start:  AIC=6914.57
## outcosts2 ~ 1
## 
##               Df Sum of Sq        RSS    AIC
## + charlson     1 240985083 1938832720 6864.0
## + racecat      3 214228353 1965589450 6874.1
## + iadl         1 163770786 2016047017 6881.5
## + depcat       2 158562936 2021254867 6884.7
## + ssi          1 133484030 2046333773 6888.2
## + educat       4 127715950 2052101853 6895.5
## + whiteleycat  2  96805662 2083012142 6898.2
## + social       1  81518772 2098299031 6899.5
## + panic        1  34614794 2145203009 6909.4
## + anxiety      1  24581357 2155236447 6911.5
## + ptage        1  20088860 2159728943 6912.4
## + female       1  13917217 2165900586 6913.7
## + married      1  13095811 2166721993 6913.9
## <none>                     2179817803 6914.6
## 
## Step:  AIC=6863.97
## outcosts2 ~ charlson
## 
##               Df Sum of Sq        RSS    AIC
## + racecat      3 152961582 1785871138 6833.1
## + depcat       2 110921296 1827911424 6841.5
## + ssi          1  85455902 1853376818 6845.7
## + iadl         1  82639449 1856193271 6846.4
## + whiteleycat  2  65028251 1873804469 6852.7
## + educat       4  78602638 1860230082 6853.4
## + social       1  45657600 1893175121 6855.3
## + anxiety      1  23073440 1915759280 6860.6
## + panic        1  20666597 1918166124 6861.2
## + female       1  19857339 1918975382 6861.3
## + married      1  18252581 1920580139 6861.7
## <none>                     1938832720 6864.0
## + ptage        1    185287 1938647434 6865.9
## 
## Step:  AIC=6833.07
## outcosts2 ~ charlson + racecat
## 
##               Df Sum of Sq        RSS    AIC
## + depcat       2  66381509 1719489629 6820.1
## + ssi          1  44909757 1740961381 6823.6
## + iadl         1  33904439 1751966699 6826.5
## + whiteleycat  2  31551428 1754319710 6829.1
## + social       1  22803842 1763067296 6829.3
## + anxiety      1  18519335 1767351803 6830.4
## + panic        1  12978761 1772892377 6831.8
## + female       1  11621963 1774249175 6832.1
## <none>                     1785871138 6833.1
## + married      1   4509966 1781361172 6833.9
## + ptage        1    265857 1785605281 6835.0
## + educat       4  20904789 1764966349 6835.8
## 
## Step:  AIC=6820.06
## outcosts2 ~ charlson + racecat + depcat
## 
##               Df Sum of Sq        RSS    AIC
## + ssi          1  21043687 1698445942 6816.5
## + iadl         1  18705072 1700784557 6817.2
## + female       1   8780270 1710709359 6819.8
## <none>                     1719489629 6820.1
## + social       1   6794491 1712695138 6820.3
## + anxiety      1   5224667 1714264962 6820.7
## + panic        1   3947281 1715542348 6821.0
## + married      1   2954262 1716535366 6821.3
## + whiteleycat  2   8535564 1710954064 6821.8
## + ptage        1     23159 1719466469 6822.1
## + educat       4  17188031 1702301597 6823.6
## 
## Step:  AIC=6816.53
## outcosts2 ~ charlson + racecat + depcat + ssi
## 
##               Df Sum of Sq        RSS    AIC
## + female       1   8252837 1690193105 6816.3
## <none>                     1698445942 6816.5
## + iadl         1   5552772 1692893170 6817.1
## + anxiety      1   2362417 1696083525 6817.9
## + married      1   2029243 1696416699 6818.0
## + panic        1   1062844 1697383098 6818.3
## + social       1    693520 1697752422 6818.4
## + ptage        1    513823 1697932119 6818.4
## + whiteleycat  2   4485912 1693960030 6819.3
## + educat       4  16018581 1682427361 6820.3
## 
## Step:  AIC=6816.35
## outcosts2 ~ charlson + racecat + depcat + ssi + female
## 
##               Df Sum of Sq        RSS    AIC
## <none>                     1690193105 6816.3
## + iadl         1   5186509 1685006596 6817.0
## + anxiety      1   2842442 1687350663 6817.6
## + panic        1    944084 1689249021 6818.1
## + married      1    656885 1689536220 6818.2
## + social       1    579968 1689613137 6818.2
## + ptage        1    215038 1689978067 6818.3
## + educat       4  19514216 1670678889 6819.1
## + whiteleycat  2   4182099 1686011007 6819.2
## View best model
summary(FS)
## 
## Call:
## lm(formula = outcosts2 ~ charlson + racecat + depcat + ssi + 
##     female, data = cost)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3808.1 -1200.9  -462.7   635.5  9036.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1054.88     516.32   2.043   0.0416 *  
## charlson      544.88      88.82   6.135 1.90e-09 ***
## racecat2      272.10     253.73   1.072   0.2841    
## racecat3      -16.91     324.81  -0.052   0.9585    
## racecat4     -920.45     232.02  -3.967 8.49e-05 ***
## depcat2       909.12     444.18   2.047   0.0413 *  
## depcat3      -282.31     339.11  -0.833   0.4056    
## ssi           347.03     150.16   2.311   0.0213 *  
## female1       313.07     213.59   1.466   0.1434    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1960 on 440 degrees of freedom
## Multiple R-squared:  0.2246, Adjusted R-squared:  0.2105 
## F-statistic: 15.93 on 8 and 440 DF,  p-value: < 2.2e-16
## Create vector of selected variables
selectedVars <- rownames(summary(FS)$coefficients)
selectedVars <- selectedVars[!selectedVars %in% c("(Intercept)")]
selectedVars
## [1] "charlson" "racecat2" "racecat3" "racecat4" "depcat2"  "depcat3" 
## [7] "ssi"      "female1"
summary(FS)$coefficients
##               Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) 1054.88163  516.32254  2.0430672 4.164106e-02
## charlson     544.87679   88.81502  6.1349624 1.900093e-09
## racecat2     272.09682  253.73241  1.0723771 2.841385e-01
## racecat3     -16.90764  324.81400 -0.0520533 9.585098e-01
## racecat4    -920.44524  232.01796 -3.9671292 8.491096e-05
## depcat2      909.11563  444.18219  2.0467179 4.127942e-02
## depcat3     -282.31156  339.10590 -0.8325174 4.055686e-01
## ssi          347.02618  150.16027  2.3110386 2.129213e-02
## female1      313.07143  213.59134  1.4657497 1.434307e-01
## do by p-value
# OLS.Mod <- paste0("outcosts2~depcat+charlson+",paste0(names(cost)[!names(cost) %in% c("charlson","depcat","outcosts2")], collapse = "+"))
# OLS <- ols(as.formula(OLS.Mod), data = cost)
# 
# FS.p <- fastbw(OLS, rule = "p", sls = 0.05, force = 4)

Multicollinearity: variance inflation factors

selectedVars <- unique(tstrsplit(selectedVars, "(?=[A-Za-z])(?<=[0-9])|(?=[0-9])(?<=[A-Za-z])", perl=TRUE)[[1]])

LinEq <- paste0("outcosts2~",paste0(selectedVars, collapse = "+"))
LinEq
## [1] "outcosts2~charlson+racecat+depcat+ssi+female"
LinMod <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
## 
## Call:
## lm(formula = as.formula(LinEq), data = cost)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3808.1 -1200.9  -462.7   635.5  9036.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1054.88     516.32   2.043   0.0416 *  
## charlson      544.88      88.82   6.135 1.90e-09 ***
## racecat2      272.10     253.73   1.072   0.2841    
## racecat3      -16.91     324.81  -0.052   0.9585    
## racecat4     -920.45     232.02  -3.967 8.49e-05 ***
## depcat2       909.12     444.18   2.047   0.0413 *  
## depcat3      -282.31     339.11  -0.833   0.4056    
## ssi           347.03     150.16   2.311   0.0213 *  
## female1       313.07     213.59   1.466   0.1434    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1960 on 440 degrees of freedom
## Multiple R-squared:  0.2246, Adjusted R-squared:  0.2105 
## F-statistic: 15.93 on 8 and 440 DF,  p-value: < 2.2e-16
vif(LinMod)
##              GVIF Df GVIF^(1/(2*Df))
## charlson 1.060233  1        1.029676
## racecat  1.154487  3        1.024232
## depcat   1.306560  2        1.069135
## ssi      1.268000  1        1.126055
## female   1.027451  1        1.013633
## Remove if VIF > 10 - none in this case

## Now create table for adjusted coefficients
coefTable <- 
  data.frame(
    variable = 
      rownames(summary(LinMod)$coefficients)[2:length(summary(LinMod)$coefficients[,1])], 
    coefficient = 
      as.numeric(summary(LinMod)$coefficients[2:length(summary(LinMod)$coefficients[,1]),1]), 
    LCI = 
      as.numeric(confint(LinMod)[2:length(summary(LinMod)$coefficients[,1]),1]), 
    UCI = as.numeric(confint(LinMod)[2:length(summary(LinMod)$coefficients[,1]),2]), 
    pvalue = 
               signif(as.numeric(summary(LinMod)$coefficients[2:length(summary(LinMod)$coefficients[,1]),4]),9))

kable(coefTable)
variable coefficient LCI UCI pvalue
charlson 544.87679 370.32241 719.4312 0.0000000
racecat2 272.09682 -226.58128 770.7749 0.2841385
racecat3 -16.90764 -655.28737 621.4721 0.9585098
racecat4 -920.44524 -1376.44641 -464.4441 0.0000849
depcat2 909.11563 36.13322 1782.0980 0.0412794
depcat3 -282.31156 -948.78016 384.1570 0.4055686
ssi 347.02618 51.90568 642.1467 0.0212921
female1 313.07143 -106.71460 732.8575 0.1434307
## Now remove SSI which is likely an intermediary variable

LinEq.int <- paste0("outcosts2~",paste0(selectedVars[!selectedVars %in% c("ssi")], collapse = "+"))
LinEq.int
## [1] "outcosts2~charlson+racecat+depcat+female"
LinMod.int <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
## 
## Call:
## lm(formula = as.formula(LinEq), data = cost)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3808.1 -1200.9  -462.7   635.5  9036.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1054.88     516.32   2.043   0.0416 *  
## charlson      544.88      88.82   6.135 1.90e-09 ***
## racecat2      272.10     253.73   1.072   0.2841    
## racecat3      -16.91     324.81  -0.052   0.9585    
## racecat4     -920.45     232.02  -3.967 8.49e-05 ***
## depcat2       909.12     444.18   2.047   0.0413 *  
## depcat3      -282.31     339.11  -0.833   0.4056    
## ssi           347.03     150.16   2.311   0.0213 *  
## female1       313.07     213.59   1.466   0.1434    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1960 on 440 degrees of freedom
## Multiple R-squared:  0.2246, Adjusted R-squared:  0.2105 
## F-statistic: 15.93 on 8 and 440 DF,  p-value: < 2.2e-16
vif(LinMod.int)
##              GVIF Df GVIF^(1/(2*Df))
## charlson 1.060233  1        1.029676
## racecat  1.154487  3        1.024232
## depcat   1.306560  2        1.069135
## ssi      1.268000  1        1.126055
## female   1.027451  1        1.013633

Univariate OLS

coefTable.uni <- data.frame(variable = character(),coefficient = numeric(), LCI = numeric(), UCI = numeric(), pvalue = numeric())

for(i in 1:length(selectedVars)) {
  
  
  LinEq.uni <- paste0("outcosts2~",selectedVars[i])
  LinEq.uni

  LinMod.uni <- lm(as.formula(LinEq.uni), data = cost)

  coefTable.uni <- rbind(coefTable.uni, data.frame(variable = selectedVars[i], 
                            coefficient = as.numeric(summary(LinMod.uni)$coefficients[2,1]), 
                            LCI = confint(LinMod.uni)[2,1], 
                            UCI = confint(LinMod.uni)[2,2], 
                            pvalue = signif(summary(LinMod.uni)$coefficients[2,4], 9)
                       ))

}


kable(coefTable.uni)
variable coefficient LCI UCI pvalue
charlson 683.1861 503.056181 863.3161 0.0000000
racecat 524.9870 1.601816 1048.3722 0.0493059
depcat 938.2778 2.750733 1873.8048 0.0493330
ssi 786.0828 499.985526 1072.1800 0.0000001
female 401.0859 -64.020461 866.1923 0.0908156

Table 2: coefficients

table2 <- coefTable
  # bind_cols(coefTable.uni,coefTable)
# table2 <- as.data.frame(sapply(table2 %>% dplyr::select(-variable,-variable1), function(x) signif(x, 5)))
kable(table2)
variable coefficient LCI UCI pvalue
charlson 544.87679 370.32241 719.4312 0.0000000
racecat2 272.09682 -226.58128 770.7749 0.2841385
racecat3 -16.90764 -655.28737 621.4721 0.9585098
racecat4 -920.44524 -1376.44641 -464.4441 0.0000849
depcat2 909.11563 36.13322 1782.0980 0.0412794
depcat3 -282.31156 -948.78016 384.1570 0.4055686
ssi 347.02618 51.90568 642.1467 0.0212921
female1 313.07143 -106.71460 732.8575 0.1434307
## if p < 0.0001
table2[,c("pvalue")] <- sapply(table2[,c("pvalue")], function(x) ifelse(x < 0.0001, "< 0.0001",signif(x,5)))


## Create vector rownames and extract labels
# variableNames <- selectedVars
# 
# variableNameLabels <- vector(length = length(variableNames))
# 
# for(y in 1:length(variableNameLabels)) {
#   
#   variableNameLabels[y] <- labelled::var_label(cost[,variableNames[y]])
# }


## Collapse 95% CIs into cell with hazard ratios
# 
# table2 <- table2 %>% mutate(variable = variableNameLabels, 
#                        Coef1 = paste0(coefficient," (",LCI,",",UCI,")"), 
#                        Coef2 = paste0(coefficient1," (",LCI1,",",UCI1,")")) %>% dplyr::select(variable,Coef1,pvalue,Coef2,pvalue1)

table2 <- table2 %>% mutate(variable = variable, coefficient = paste0(signif(coefficient,5),"(",signif(LCI,5),",",signif(UCI,5),")"), pvalue = pvalue)
## Warning: package 'bindrcpp' was built under R version 3.4.4
colnames(table2) <- c("Variable", "Unadjusted Coefficient (95% CI)","p-value","Adjusted Coefficient (95% CI)","p-value")

kable(table2)
Variable Unadjusted Coefficient (95% CI) p-value Adjusted Coefficient (95% CI) p-value
charlson 544.88(370.32,719.43) 370.32241 719.4312 < 0.0001
racecat2 272.1(-226.58,770.77) -226.58128 770.7749 0.28414
racecat3 -16.908(-655.29,621.47) -655.28737 621.4721 0.95851
racecat4 -920.45(-1376.4,-464.44) -1376.44641 -464.4441 < 0.0001
depcat2 909.12(36.133,1782.1) 36.13322 1782.0980 0.041279
depcat3 -282.31(-948.78,384.16) -948.78016 384.1570 0.40557
ssi 347.03(51.906,642.15) 51.90568 642.1467 0.021292
female1 313.07(-106.71,732.86) -106.71460 732.8575 0.14343
tab_df(table2, file = paste0(analysisDir,"Table2.doc"))
Variable Unadjusted Coefficient (95% CI) p-value Adjusted Coefficient (95% CI) p-value
charlson 544.88(370.32,719.43) 370.322411073949 719.431171800222 < 0.0001
racecat2 272.1(-226.58,770.77) -226.58128068612 770.774913529939 0.28414
racecat3 -16.908(-655.29,621.47) -655.287374936688 621.472095005024 0.95851
racecat4 -920.45(-1376.4,-464.44) -1376.44640656538 -464.444067057424 < 0.0001
depcat2 909.12(36.133,1782.1) 36.1332221228739 1782.09804417907 0.041279
depcat3 -282.31(-948.78,384.16) -948.780163442076 384.157045465912 0.40557
ssi 347.03(51.906,642.15) 51.9056756009429 642.146691484815 0.021292
female1 313.07(-106.71,732.86) -106.714604524497 732.85747440742 0.14343

Effect modification

## Evaluate for effect modification by adding an interaction term
# cost <- cost %>% mutate(chdep = charlson*depcat)
# labelled::var_label(cost$chdep) <- 'Charlson*Depression Category Interaction Term' # label this new term

## Retrain the model with the new interaction term
LinEq <- paste0("outcosts2~",paste0(c(selectedVars), collapse = "+"),"+depcat*charlson")
LinEq
## [1] "outcosts2~charlson+racecat+depcat+ssi+female+depcat*charlson"
LinMod <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
## 
## Call:
## lm(formula = as.formula(LinEq), data = cost)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3893.4 -1214.6  -463.0   666.4  9055.0 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1044.72     540.11   1.934   0.0537 .  
## charlson           562.42     226.90   2.479   0.0136 *  
## racecat2           258.06     254.16   1.015   0.3105    
## racecat3           -28.67     327.69  -0.087   0.9303    
## racecat4          -931.26     232.60  -4.004 7.32e-05 ***
## depcat2            673.12     533.43   1.262   0.2077    
## depcat3           -239.17     388.77  -0.615   0.5387    
## ssi                350.66     150.36   2.332   0.0201 *  
## female1            305.93     213.83   1.431   0.1532    
## charlson:depcat2   250.77     332.31   0.755   0.4509    
## charlson:depcat3   -71.07     249.94  -0.284   0.7763    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1961 on 438 degrees of freedom
## Multiple R-squared:  0.2273, Adjusted R-squared:  0.2096 
## F-statistic: 12.88 on 10 and 438 DF,  p-value: < 2.2e-16
vif(LinMod)
##                      GVIF Df GVIF^(1/(2*Df))
## charlson         6.912099  1        2.629087
## racecat          1.180298  3        1.028013
## depcat           2.448953  2        1.250965
## ssi              1.269971  1        1.126930
## female           1.028575  1        1.014187
## charlson:depcat 10.274368  2        1.790353
## Now remove SSI

## Retrain the model with the new interaction term
LinEq <- paste0("outcosts2~",paste0(c(selectedVars[!selectedVars %in% c("ssi")]), collapse = "+"),"+depcat*charlson")
LinEq
## [1] "outcosts2~charlson+racecat+depcat+female+depcat*charlson"
LinMod <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
## 
## Call:
## lm(formula = as.formula(LinEq), data = cost)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3463.0 -1232.2  -474.2   634.5  9339.5 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1856.86     414.93   4.475 9.74e-06 ***
## charlson           597.74     227.54   2.627  0.00892 ** 
## racecat2           311.60     254.39   1.225  0.22127    
## racecat3           -14.05     329.28  -0.043  0.96598    
## racecat4          -979.14     232.86  -4.205 3.17e-05 ***
## depcat2            606.81     535.36   1.133  0.25764    
## depcat3           -501.71     373.99  -1.341  0.18046    
## female1            316.39     214.86   1.473  0.14159    
## charlson:depcat2   220.53     333.73   0.661  0.50909    
## charlson:depcat3   -87.75     251.10  -0.349  0.72692    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1971 on 439 degrees of freedom
## Multiple R-squared:  0.2177, Adjusted R-squared:  0.2016 
## F-statistic: 13.57 on 9 and 439 DF,  p-value: < 2.2e-16
vif(LinMod)
##                      GVIF Df GVIF^(1/(2*Df))
## charlson         6.881309  1        2.623225
## racecat          1.150483  3        1.023639
## depcat           2.186324  2        1.215986
## female           1.028123  1        1.013964
## charlson:depcat 10.258419  2        1.789658

Figure 1: plot of outpatient costs vs. depression category grouped by depression category

## plot line of best fit for outpatient costs vs charlson grouped by depression category

ggplot(data = cost[,c("outcosts2",selectedVars)], 
       aes(x = charlson, y = outcosts2,group = as.factor(depcat), color = as.factor(depcat))) + 
  geom_smooth(method = 'lm') + 
  ggtitle(" ") + xlab("Charlson Index") + ylab("Outpatient Costs") + 
  labs(color = "Depression \nCategory\n") + 
  scale_color_manual(labels = c("Major","Minor","None"), values = c("red","blue","dark green"))

## Save file
pdf(paste0(analysisDir,"Figure1.pdf"))

ggplot(data = cost[,c("outcosts2",selectedVars)], 
       aes(x = charlson, y = outcosts2,group = as.factor(depcat), color = as.factor(depcat))) + 
  geom_smooth(method = 'lm') + 
  ggtitle(" ") + xlab("Charlson Index") + ylab("Outpatient Costs") + 
  labs(color = "Depression \nCategory\n") + 
  scale_color_manual(labels = c("Major","Minor","None"), values = c("red","blue","dark green"))
 

dev.off()
## quartz_off_screen 
##                 2

Session Info

sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2      leaps_3.0           car_3.0-0          
##  [4] carData_3.0-1       stargazer_5.2.1     sjPlot_2.4.1       
##  [7] tableone_0.9.3      ppcor_1.1           MASS_7.3-49        
## [10] lmSupport_2.9.13    lsmeans_2.27-62     e1071_1.6-8        
## [13] epiR_0.9-97         survival_2.41-3     moments_0.14       
## [16] readxl_1.0.0        knitr_1.20          data.table_1.10.4-3
## [19] forcats_0.3.0       stringr_1.3.0       dplyr_0.7.4        
## [22] purrr_0.2.4         readr_1.1.1         tidyr_0.8.0        
## [25] tibble_1.4.2        ggplot2_2.2.1       tidyverse_1.2.1    
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.2    blme_1.0-4         VGAM_1.0-6        
##   [4] plyr_1.8.4         lazyeval_0.2.1     sp_1.2-7          
##   [7] TMB_1.7.13         splines_3.4.3      unmarked_0.12-2   
##  [10] TH.data_1.0-8      digest_0.6.15      htmltools_0.3.6   
##  [13] gdata_2.18.0       magrittr_1.5       AICcmodavg_2.1-1  
##  [16] openxlsx_4.0.17    modelr_0.1.1       sandwich_2.4-0    
##  [19] colorspace_1.3-2   rvest_0.3.2        BiasedUrn_1.07    
##  [22] haven_1.1.1        crayon_1.3.4       jsonlite_1.5      
##  [25] lme4_1.1-17        bindr_0.1.1        zoo_1.8-1         
##  [28] glue_1.2.0         gtable_0.2.0       emmeans_1.1.3     
##  [31] sjstats_0.14.2-3   sjmisc_2.7.1       abind_1.4-5       
##  [34] scales_0.5.0       mvtnorm_1.0-7      ggeffects_0.3.2   
##  [37] Rcpp_0.12.16       xtable_1.8-2       merTools_0.3.0    
##  [40] foreign_0.8-69     stats4_3.4.3       prediction_0.2.0  
##  [43] survey_3.33-2      DT_0.4             htmlwidgets_1.0   
##  [46] httr_1.3.1         gplots_3.0.1       modeltools_0.2-21 
##  [49] pkgconfig_2.0.1    reshape_0.8.7      nnet_7.3-12       
##  [52] utf8_1.1.3         tidyselect_0.2.4   labeling_0.3      
##  [55] rlang_0.2.0        reshape2_1.4.3     later_0.7.4       
##  [58] munsell_0.4.3      cellranger_1.1.0   tools_3.4.3       
##  [61] cli_1.0.0          sjlabelled_1.0.8   broom_0.4.4       
##  [64] ggridges_0.5.0     evaluate_0.10.1    arm_1.9-3         
##  [67] yaml_2.1.18        caTools_1.17.1     coin_1.2-2        
##  [70] nlme_3.1-137       mime_0.5           xml2_1.2.0        
##  [73] compiler_3.4.3     pbkrtest_0.4-7     bayesplot_1.5.0   
##  [76] rstudioapi_0.7     curl_3.2           stringi_1.1.7     
##  [79] highr_0.6          lattice_0.20-35    Matrix_1.2-13     
##  [82] psych_1.8.3.3      nloptr_1.0.4       effects_4.0-1     
##  [85] stringdist_0.9.4.7 pillar_1.2.1       pwr_1.2-2         
##  [88] lmtest_0.9-36      estimability_1.3   bitops_1.0-6      
##  [91] raster_2.6-7       httpuv_1.4.5       R6_2.2.2          
##  [94] promises_1.0.1     KernSmooth_2.23-15 rio_0.5.10        
##  [97] codetools_0.2-15   gtools_3.5.0       assertthat_0.2.0  
## [100] rprojroot_1.3-2    mnormt_1.5-5       multcomp_1.4-8    
## [103] parallel_3.4.3     hms_0.4.2          grid_3.4.3        
## [106] labelled_1.0.1     coda_0.19-1        glmmTMB_0.2.0     
## [109] class_7.3-14       minqa_1.2.4        rmarkdown_1.9     
## [112] snakecase_0.9.1    gvlma_1.0.0.2      shiny_1.1.0       
## [115] lubridate_1.7.3